import pandas as pd
import sqlalchemy as sa
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
# Global plotly configuration: seaborn-style theme, render inline in
# VS Code notebooks.
pio.templates.default = "seaborn"
pio.renderers.default = "vscode+notebook"
# Show every DataFrame column in notebook output (no truncation).
pd.options.display.max_columns = None
# Create a SQLAlchemy connection to the local Postgres instance.
# NOTE(review): credentials are hard-coded in the URL — move them to
# environment variables or a secrets store before sharing this notebook.
engine = sa.create_engine(
    # postgres connection url
    "postgresql://postgres:pass@localhost/postgres",
    # set search_path so unqualified table names resolve to the "hr" schema
    connect_args={"options": "-c search_path=hr"},
)
conn = engine.connect()
This step is not needed if the data was already imported into the database some other way.
# One-time load: push the CSVs into Postgres (uncomment to re-import).
# employee_df = pd.read_csv("Datasets/employee_survey_data.csv", index_col=0)
# employee_df.to_sql("employee_survey_data", conn, if_exists="replace")
# general_df = pd.read_csv("Datasets/general_data.csv", index_col=8)
# general_df.to_sql("general_data", conn, if_exists="replace")
# manager_df = pd.read_csv("Datasets/manager_survey_data.csv", index_col=0)
# manager_df.to_sql("manager_survey_data", conn, if_exists="replace")
# Read the three survey tables back out of the "hr" schema.
employee_df = pd.read_sql_table("employee_survey_data", conn)
general_df = pd.read_sql_table("general_data", conn)
manager_df = pd.read_sql_table("manager_survey_data", conn)
#######################################
# Alternatively import from csv files #
#######################################
# employee_df = pd.read_csv("Datasets/employee_survey_data.csv", index_col=0)
# general_df = pd.read_csv("Datasets/general_data.csv", index_col=8)
# manager_df = pd.read_csv("Datasets/manager_survey_data.csv", index_col=0)
# Preview the first rows of each table (display is the IPython helper).
display(employee_df.head())
display(general_df.head())
display(manager_df.head())
| EmployeeID | EnvironmentSatisfaction | JobSatisfaction | WorkLifeBalance | |
|---|---|---|---|---|
| 0 | 1 | 3.0 | 4.0 | 2.0 |
| 1 | 2 | 3.0 | 2.0 | 4.0 |
| 2 | 3 | 2.0 | 2.0 | 1.0 |
| 3 | 4 | 4.0 | 4.0 | 3.0 |
| 4 | 5 | 4.0 | 1.0 | 3.0 |
| EmployeeID | Age | Attrition | BusinessTravel | Department | DistanceFromHome | Education | EducationField | EmployeeCount | Gender | JobLevel | JobRole | MaritalStatus | MonthlyIncome | NumCompaniesWorked | Over18 | PercentSalaryHike | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | YearsAtCompany | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 51 | No | Travel_Rarely | Sales | 6 | 2 | Life Sciences | 1 | Female | 1 | Healthcare Representative | Married | 131160 | 1.0 | Y | 11 | 8 | 0 | 1.0 | 6 | 1 | 0 | 0 |
| 1 | 2 | 31 | Yes | Travel_Frequently | Research & Development | 10 | 1 | Life Sciences | 1 | Female | 1 | Research Scientist | Single | 41890 | 0.0 | Y | 23 | 8 | 1 | 6.0 | 3 | 5 | 1 | 4 |
| 2 | 3 | 32 | No | Travel_Frequently | Research & Development | 17 | 4 | Other | 1 | Male | 4 | Sales Executive | Married | 193280 | 1.0 | Y | 15 | 8 | 3 | 5.0 | 2 | 5 | 0 | 3 |
| 3 | 4 | 38 | No | Non-Travel | Research & Development | 2 | 5 | Life Sciences | 1 | Male | 3 | Human Resources | Married | 83210 | 3.0 | Y | 11 | 8 | 3 | 13.0 | 5 | 8 | 7 | 5 |
| 4 | 5 | 32 | No | Travel_Rarely | Research & Development | 10 | 1 | Medical | 1 | Male | 1 | Sales Executive | Single | 23420 | 4.0 | Y | 12 | 8 | 2 | 9.0 | 2 | 6 | 0 | 4 |
| EmployeeID | JobInvolvement | PerformanceRating | |
|---|---|---|---|
| 0 | 1 | 3 | 3 |
| 1 | 2 | 2 | 4 |
| 2 | 3 | 3 | 3 |
| 3 | 4 | 2 | 3 |
| 4 | 5 | 3 | 3 |
# Join the three survey tables on EmployeeID into a single analysis frame.
# Left joins keep every employee present in the general data.
data_df = (
    general_df
    .merge(employee_df, on="EmployeeID", how="left")
    .merge(manager_df, on="EmployeeID", how="left")
)
We will check for any null values, and decide if we want to drop them or impute them.
# Count the ROWS that contain at least one null value.
# BUG FIX: np.count_nonzero(data_df.isna()) counted individual null CELLS,
# so a row with several nulls was counted multiple times and the printed
# "percent of rows" was overstated. isna().any(axis=1) flags each row once.
num_null_rows = int(data_df.isna().any(axis=1).sum())
print(f"Number of rows with null values: {num_null_rows}")
print(f"Percent of rows with null values: {num_null_rows/data_df.shape[0]*100:.3f}%")
Number of rows with null values: 111 Percent of rows with null values: 2.517%
There are not very many null values so we can safely drop them without affecting the final analysis.
# Nulls are rare (see the count above), so dropping the affected rows is safe.
data_df = data_df.dropna(axis=0)
# Sanity check: the merged frame should contain no duplicated rows.
duplicate_count = data_df.duplicated().sum()
print(f"Duplicate Rows: {duplicate_count}")
Duplicate Rows: 0
There are no duplicate rows in the dataset.
The variables in the employee and manager surveys are pretty self-explanatory.
# Identifier column — excluded from every analysis.
ignored_columns = [
    "EmployeeID",
]
# Continuous / count-valued columns from the general data.
numerical_columns = [
    # "EmployeeID",
    "Age",
    # "Attrition",
    # "BusinessTravel",
    # "Department",
    "DistanceFromHome",
    # "Education",
    # "EducationField",
    "EmployeeCount",
    # "Gender",
    # "JobLevel",
    # "JobRole",
    # "MaritalStatus",
    "MonthlyIncome",
    "NumCompaniesWorked",
    # "Over18",
    "PercentSalaryHike",
    "StandardHours",
    "StockOptionLevel",
    "TotalWorkingYears",
    "TrainingTimesLastYear",
    "YearsAtCompany",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager",
    # 'EnvironmentSatisfaction',
    # 'JobSatisfaction',
    # 'WorkLifeBalance',
    # 'JobInvolvement',
    # 'PerformanceRating'
]
# Everything else in the general data is treated as categorical.
categorical_columns = [
    var
    for var in general_df.columns
    if var not in numerical_columns and var not in ignored_columns
]
# # make categorical columns into category type
# for col in categorical_columns:
#     data_df[col] = data_df[col].astype("category")
Create helper functions to visualise numerical and categorical data.
def numerical_vis(data, variable):
    """Show a histogram and a boxplot of a numerical column side by side.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to plot.
    variable : str
        Name of the numerical column in ``data``.
    """
    # make_subplots is already imported at module level; the previous
    # function-local re-import was redundant and has been removed, along
    # with the dead commented-out matplotlib version of this plot.
    hist_fig = px.histogram(data, x=variable, nbins=20)
    box_fig = px.box(data, y=variable)
    final_fig = make_subplots(rows=1, cols=2)
    final_fig.add_trace(hist_fig.data[0], row=1, col=1)
    final_fig.add_trace(box_fig.data[0], row=1, col=2)
    final_fig.update_layout(title_text=f"Histogram and Boxplot of {variable}")
    final_fig.show()
def categorical_vis(data, variable):
    """Show a pie chart and a bar chart of a categorical column side by side.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to plot.
    variable : str
        Name of the categorical column in ``data``.
    """
    pie_fig = px.pie(data, names=variable)
    bar_fig = px.histogram(data, x=variable, color=variable)
    # The pie trace needs a 'pie'-type subplot cell; the bar chart uses 'xy'.
    final_fig = make_subplots(
        rows=1, cols=2, specs=[[{'type': 'pie'}, {'type': 'xy'}]]
    )
    final_fig.add_trace(pie_fig.data[0], row=1, col=1)
    # px.histogram emits one trace per colour group; copy them all over.
    for trace in bar_fig.data:
        final_fig.add_trace(trace, row=1, col=2)
    # Consolidated layout (previously six separate update_layout calls).
    final_fig.update_layout(
        barmode='stack',
        bargap=0.1,
        title_text=f"Distribution of {variable}",
        title_x=0.5,        # centre the title
        showlegend=False,   # the colour legend duplicates the x labels
        title_font_size=20,
    )
    final_fig.show()
# Distribution of the target variable.
categorical_vis(data_df, "Attrition")
# NOTE(review): numerical distributions are plotted from general_df
# (pre-merge, pre-dropna) rather than data_df — confirm this is intentional.
for var in numerical_columns:
    numerical_vis(general_df, var)
From the plots we can see a quick overview of the distribution of the variables.
# EmployeeCount and StandardHours are constant across all rows, so they
# carry no information — drop each from the frame and from the numerical
# column list in one pass.
for constant_col in ("EmployeeCount", "StandardHours"):
    data_df = data_df.drop(columns=[constant_col])
    numerical_columns.remove(constant_col)
data_df.head()
| EmployeeID | Age | Attrition | BusinessTravel | Department | DistanceFromHome | Education | EducationField | Gender | JobLevel | JobRole | MaritalStatus | MonthlyIncome | NumCompaniesWorked | Over18 | PercentSalaryHike | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | YearsAtCompany | YearsSinceLastPromotion | YearsWithCurrManager | EnvironmentSatisfaction | JobSatisfaction | WorkLifeBalance | JobInvolvement | PerformanceRating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 51 | No | Travel_Rarely | Sales | 6 | 2 | Life Sciences | Female | 1 | Healthcare Representative | Married | 131160 | 1.0 | Y | 11 | 0 | 1.0 | 6 | 1 | 0 | 0 | 3.0 | 4.0 | 2.0 | 3 | 3 |
| 1 | 2 | 31 | Yes | Travel_Frequently | Research & Development | 10 | 1 | Life Sciences | Female | 1 | Research Scientist | Single | 41890 | 0.0 | Y | 23 | 1 | 6.0 | 3 | 5 | 1 | 4 | 3.0 | 2.0 | 4.0 | 2 | 4 |
| 2 | 3 | 32 | No | Travel_Frequently | Research & Development | 17 | 4 | Other | Male | 4 | Sales Executive | Married | 193280 | 1.0 | Y | 15 | 3 | 5.0 | 2 | 5 | 0 | 3 | 2.0 | 2.0 | 1.0 | 3 | 3 |
| 3 | 4 | 38 | No | Non-Travel | Research & Development | 2 | 5 | Life Sciences | Male | 3 | Human Resources | Married | 83210 | 3.0 | Y | 11 | 3 | 13.0 | 5 | 8 | 7 | 5 | 4.0 | 4.0 | 3.0 | 2 | 3 |
| 4 | 5 | 32 | No | Travel_Rarely | Research & Development | 10 | 1 | Medical | Male | 1 | Sales Executive | Single | 23420 | 4.0 | Y | 12 | 2 | 9.0 | 2 | 6 | 0 | 4 | 4.0 | 1.0 | 3.0 | 3 | 3 |
# Plot the distribution of every categorical column.
for var in categorical_columns:
    categorical_vis(data_df, var)
The distribution of the categorical variables can be seen above.
# Over18 is constant (every employee is over 18), so it carries no signal.
data_df = data_df.drop(columns=["Over18"])
# Keep the categorical column list in sync with the frame.
categorical_columns.remove("Over18")
We will encode binary categorical variables as 1/0 and map ordered categorical variables to numbers.
# ordered categorical data to numerical
# Binary target: Yes -> 1, No -> 0.
data_df["Attrition"] = data_df["Attrition"].map({"Yes": 1, "No": 0})
# data_df["Over18"] = data_df["Over18"].map({"Y": 1, "N": 0})
# BusinessTravel has a natural order: none < rarely < frequently.
# NOTE: Series.map leaves any unmapped value as NaN.
data_df["BusinessTravel"] = data_df["BusinessTravel"].map(
    {"Non-Travel": 0, "Travel_Rarely": 1, "Travel_Frequently": 2}
)
# Sanity-check the encoding.
data_df["BusinessTravel"].value_counts()
data_df.head()
| EmployeeID | Age | Attrition | BusinessTravel | Department | DistanceFromHome | Education | EducationField | Gender | JobLevel | JobRole | MaritalStatus | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | YearsAtCompany | YearsSinceLastPromotion | YearsWithCurrManager | EnvironmentSatisfaction | JobSatisfaction | WorkLifeBalance | JobInvolvement | PerformanceRating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 51 | 0 | 1 | Sales | 6 | 2 | Life Sciences | Female | 1 | Healthcare Representative | Married | 131160 | 1.0 | 11 | 0 | 1.0 | 6 | 1 | 0 | 0 | 3.0 | 4.0 | 2.0 | 3 | 3 |
| 1 | 2 | 31 | 1 | 2 | Research & Development | 10 | 1 | Life Sciences | Female | 1 | Research Scientist | Single | 41890 | 0.0 | 23 | 1 | 6.0 | 3 | 5 | 1 | 4 | 3.0 | 2.0 | 4.0 | 2 | 4 |
| 2 | 3 | 32 | 0 | 2 | Research & Development | 17 | 4 | Other | Male | 4 | Sales Executive | Married | 193280 | 1.0 | 15 | 3 | 5.0 | 2 | 5 | 0 | 3 | 2.0 | 2.0 | 1.0 | 3 | 3 |
| 3 | 4 | 38 | 0 | 0 | Research & Development | 2 | 5 | Life Sciences | Male | 3 | Human Resources | Married | 83210 | 3.0 | 11 | 3 | 13.0 | 5 | 8 | 7 | 5 | 4.0 | 4.0 | 3.0 | 2 | 3 |
| 4 | 5 | 32 | 0 | 1 | Research & Development | 10 | 1 | Medical | Male | 1 | Sales Executive | Single | 23420 | 4.0 | 12 | 2 | 9.0 | 2 | 6 | 0 | 4 | 4.0 | 1.0 | 3.0 | 3 | 3 |
# Correlation matrix of the numeric columns.
# FIX: data_df still contains string columns (Department, JobRole, ...);
# DataFrame.corr() without numeric_only=True raises a TypeError on
# pandas >= 2.0 (older versions silently skipped non-numeric columns).
corr = data_df.corr(numeric_only=True)
px.imshow(
    corr,
    height=1200,
    width=1200,
    # clamp the colour scale to the full correlation range [-1, 1]
    zmin=-1,
    zmax=1,
    title="Correlation Matrix",
    # diverging scale: red = negative, green = positive
    color_continuous_scale=px.colors.diverging.RdYlGn,
)
def compare_categorical_vis(data, variable):
    """Plot an overlayed histogram of `variable` split by Attrition.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both `variable` and an "Attrition" column.
    variable : str
        Name of the categorical column to compare.
    """
    # update_layout returns the figure, so the calls chain directly.
    px.histogram(
        data,
        x=variable,
        color="Attrition",
        barmode="overlay",
        title=f"Histogram of {variable} by Attrition",
    ).update_layout(bargap=0.1).show()
def compare_numerical_vis(data, variable):
    """Histogram and boxplot of a numerical column, split by Attrition.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both `variable` and an "Attrition" column.
    variable : str
        Name of the numerical column to compare.
    """
    hist_fig = px.histogram(data, x=variable, color="Attrition")
    box_fig = px.box(data, x="Attrition", y=variable, color="Attrition")
    final_fig = make_subplots(rows=1, cols=2)
    # FIX: the previous version indexed .data[0] and .data[1] directly,
    # assuming exactly two Attrition levels; looping over all traces is
    # robust and matches the pattern used in categorical_vis.
    for trace in hist_fig.data:
        final_fig.add_trace(trace, row=1, col=1)
    for trace in box_fig.data:
        final_fig.add_trace(trace, row=1, col=2)
    final_fig.update_layout(
        title_text=f"Attrition by {variable}",
        title_x=0.5,          # centre the title
        barmode="overlay",    # overlay the per-group histograms
    )
    final_fig.show()
# Compare every numerical column against the Attrition target.
for var in numerical_columns:
    compare_numerical_vis(data_df, var)
# Compare every categorical column against the Attrition target.
for var in categorical_columns:
    compare_categorical_vis(data_df, var)